Chapter 3 Data statistics

load("data/data.Rdata")

3.1 Sequencing reads statistics

sample_metadata %>% 
    summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
              sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
tinytable_mcr1zj1g3l9niccqmeti
Total Average
398.22 4.33 ± 1.45

3.2 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == sample)) %>%
    dplyr::select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    dplyr::select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
tinytable_c9nj5xsmen0wj5ukr8wy
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI01436 0.10323450 0.002155664 1.9223563 0.14057376
EHI01437 0.14688974 0.001776019 1.3951997 3.75777530
EHI01438 0.11341031 0.001443667 1.0781908 2.93324556
EHI01439 0.09710566 0.000553848 0.7633965 2.39854085
EHI01440 0.08161119 0.000412702 0.6588022 2.80294070
EHI01441 0.09842650 0.001532668 0.9259464 2.26467053
EHI01442 0.09044909 0.003948668 1.3099571 1.91223033
EHI01443 0.08349080 0.001670709 0.7703155 1.97118600
EHI01444 0.15356698 0.008828658 1.8761546 2.25276496
EHI01445 0.24321310 0.000493809 0.6808145 2.86800064
EHI01447 0.07308100 0.000928745 0.5335369 1.86430335
EHI01448 0.16654049 0.001447619 1.2343569 4.00302946
EHI01449 0.08086772 0.000712056 1.9015472 0.56772173
EHI01450 0.09513485 0.000808493 0.6801877 2.40271908
EHI01451 0.15486528 0.006054971 1.2002374 3.09457074
EHI01584 0.31311426 0.001124632 1.1945509 4.46459065
EHI01585 0.43461750 0.010705116 4.7173126 5.56319787
EHI01610 0.09609182 0.002231717 0.7319288 2.22137350
EHI01611 0.12854248 0.002113926 2.0768251 1.11056915
EHI01612 0.09445979 0.000396947 0.8434826 2.27477884
EHI01613 0.11771235 0.003535185 0.8236243 2.34411760
EHI01616 0.13311594 0.000846441 1.0689190 2.28105071
EHI01617 0.10496582 0.000600034 0.7624687 1.66612338
EHI01618 0.15796519 0.001938639 1.4700412 1.45306500
EHI01619 0.21423836 0.001610432 2.1137921 2.57152267
EHI01620 0.15125219 0.002882104 1.5452853 1.20720363
EHI01621 0.14437434 0.000908440 0.8938030 2.84473642
EHI01623 0.13905131 0.000111578 0.8158237 2.89356495
EHI01625 0.13457413 0.004361014 0.9973074 3.14405452
EHI01627 0.11510846 0.000652453 0.8491846 2.40277675
EHI01628 0.13073679 0.002521283 1.2403353 1.86084796
EHI01631 0.09932800 0.001705834 0.8044096 1.77531722
EHI02085 0.57919563 0.018462021 2.6410532 1.29059649
EHI02095 0.48834211 0.013513952 3.4049831 3.18378200
EHI02104 0.53195157 0.007010599 2.7354830 0.81316072
EHI02109 1.28374828 0.008847543 4.9223937 0.24490536
EHI02115 0.92901178 0.003428911 3.9141515 0.19065498
EHI02128 0.96743572 0.007418441 4.7627289 0.16898799
EHI02571 0.13287769 0.000830065 1.3326271 2.42337662
EHI02572 0.23097718 0.001104376 4.2180228 0.66197962
EHI02573 0.23772138 0.000919831 1.4201215 2.90177102
EHI02574 0.18641515 0.001105944 1.2909807 2.51445595
EHI02575 0.16800006 0.000451116 0.7701716 3.10908854
EHI02576 0.27054446 0.001466596 1.2373298 4.37179582
EHI02577 0.13875010 0.000159264 0.6656173 3.11920576
EHI02578 0.18840997 0.000370817 0.9281657 3.79342865
EHI02579 0.17040671 0.000316117 1.1304582 2.89912711
EHI02580 0.14585212 0.002285096 1.1005491 2.55888783
EHI02581 0.31534501 0.004222759 1.1406810 3.45948869
EHI02583 0.19024490 0.002688647 2.5186672 1.41498835
EHI02586 0.16808497 0.000634686 1.2191413 2.86012073
EHI02588 0.22724830 0.000620576 1.3616533 3.96253913
EHI02589 0.17868765 0.001172553 1.3032228 2.88262020
EHI02590 NA NA NA 0.01003195
EHI02591 0.24672078 0.005870858 5.1327559 0.11609000
EHI02593 0.28535188 0.001047289 1.7734727 3.51223396
EHI02594 0.14799113 0.001254245 2.2455458 0.78283346
EHI02595 0.17594927 0.000390443 0.8947388 2.65116816
EHI02596 0.19552510 0.000402096 0.8853602 3.90930111
EHI02597 0.16997179 0.003302624 1.8718822 2.27029869
EHI02599 0.24535794 0.002994709 1.6290468 3.25617683
EHI02600 0.16539330 0.000372951 2.5885522 0.78914504
EHI02601 0.20440033 0.000546487 1.2005389 3.29478799
EHI02604 0.13350290 0.000033019 0.5459000 2.24831357
EHI02605 0.23336073 0.000578456 0.7867636 4.00240210
EHI02606 0.21269906 0.000840830 0.9069484 4.52838330
EHI02608 0.16608092 0.000868246 1.2007537 2.21539480
EHI02609 0.18162303 0.000620091 1.2032065 3.45803993
EHI02610 0.18029349 0.000359149 1.0625221 3.04269811
EHI02611 0.14935861 0.000280047 0.9678059 2.31031466
EHI02613 0.13705908 0.001431597 0.6736314 2.75385288
EHI02614 0.18096790 0.000783821 1.1284081 2.27501989
EHI02616 0.14713007 0.000239190 0.5108695 2.35465880
EHI02618 0.17147141 0.000675857 0.9085123 2.63753628
EHI02620 0.15234767 0.000669926 0.8504433 2.26631084
EHI02621 0.18176981 0.000880545 2.0099358 2.33263631
EHI02622 0.20546598 0.000285185 1.1929782 3.39019490
EHI02623 0.14304619 0.000230072 0.5719310 2.72557253
EHI02626 0.20016240 0.000600853 0.9178175 2.61054132
EHI02627 0.16663385 0.001572916 2.4785902 0.65110510
EHI02628 0.14066292 0.000620136 0.5260281 1.86407529
EHI02629 0.15442873 0.001035638 1.3085973 2.62663840
EHI02631 0.28751065 0.003383238 4.2392404 1.34326512
EHI02634 0.27934868 0.001091891 2.4750414 3.24088595
EHI02635 0.30611130 0.001240215 1.3218018 4.73514062
EHI02636 0.33858819 0.000408731 1.2870087 6.26143207
EHI02637 0.25767107 0.000313024 1.3477516 5.69477117
EHI02638 0.26777696 0.001869868 1.9786471 3.30618562
EHI02640 0.20192954 0.002568987 1.1840044 3.29878050
EHI02641 0.53193661 0.006973676 3.8010965 4.56692336
EHI02642 0.35162886 0.000765507 5.4173877 1.45875812
EHI02643 0.33426863 0.001865962 5.7701855 1.11141741
EHI02646 0.35059944 0.008032777 6.5018324 1.15256692
sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()
tinytable_kbzq2wwfayglwjfsvv4u
Low quality Mapped to host Unmapped Mapped to MAGs
0.2261139 0.002232012 1.70865 2.507462
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  left_join(sample_metadata, by = join_by(sample == sample))%>%
    ggplot(., aes(x = sample, y = value, fill=fraction)) +
        geom_bar(position="stack", stat = "identity") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
  facet_grid(~region, scales="free")+
        labs(x = "Samples", y = "Amount of data (GB)") +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

3.3 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
    left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
    dplyr::select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")

damr <- singlem_table %>%
  mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
  dplyr::select(sample,damr)

damr %>% 
  tt()
tinytable_l1uxnu6wyrnt4mmhwnpg
sample damr
EHI01436 14.498616
EHI01437 99.116488
EHI01438 98.837524
EHI01439 100.000000
EHI01440 100.000000
EHI01441 100.000000
EHI01442 95.602448
EHI01443 100.000000
EHI01444 95.853830
EHI01445 100.000000
EHI01447 94.747746
EHI01448 100.000000
EHI01449 39.258880
EHI01450 100.000000
EHI01451 100.000000
EHI01584 100.000000
EHI01585 100.000000
EHI01610 100.000000
EHI01611 59.100933
EHI01612 100.000000
EHI01613 100.000000
EHI01616 100.000000
EHI01617 100.000000
EHI01618 100.000000
EHI01619 100.000000
EHI01620 100.000000
EHI01621 100.000000
EHI01623 100.000000
EHI01625 100.000000
EHI01627 100.000000
EHI01628 100.000000
EHI01631 100.000000
EHI02085 100.000000
EHI02095 100.000000
EHI02104 46.583977
EHI02109 12.599681
EHI02115 8.541973
EHI02128 8.243211
EHI02571 100.000000
EHI02572 24.180328
EHI02573 100.000000
EHI02574 85.807038
EHI02575 100.000000
EHI02576 100.000000
EHI02577 100.000000
EHI02578 100.000000
EHI02579 100.000000
EHI02580 100.000000
EHI02581 100.000000
EHI02583 64.485479
EHI02586 100.000000
EHI02588 100.000000
EHI02589 100.000000
EHI02590 100.000000
EHI02591 79.211470
EHI02593 100.000000
EHI02594 48.390116
EHI02595 100.000000
EHI02596 100.000000
EHI02597 100.000000
EHI02599 100.000000
EHI02600 41.916383
EHI02601 100.000000
EHI02604 100.000000
EHI02605 100.000000
EHI02606 100.000000
EHI02608 100.000000
EHI02609 100.000000
EHI02610 100.000000
EHI02611 100.000000
EHI02613 100.000000
EHI02614 100.000000
EHI02616 100.000000
EHI02618 100.000000
EHI02620 100.000000
EHI02621 100.000000
EHI02622 100.000000
EHI02623 100.000000
EHI02626 100.000000
EHI02627 38.053421
EHI02628 100.000000
EHI02629 100.000000
EHI02631 42.964286
EHI02634 100.000000
EHI02635 100.000000
EHI02636 100.000000
EHI02637 100.000000
EHI02638 93.150685
EHI02640 100.000000
EHI02641 98.715862
EHI02642 36.752729
EHI02643 49.328039
EHI02646 28.060369
damr %>% 
  filter(damr>95) %>%
  nrow()
[1] 73
damr %>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
tinytable_uw94fuv33ft4opfpzr1s
mean sd
88.21507 25.46559
samples_50damr <- damr %>% 
  filter(damr>50) %>% 
  select(sample)

3.3.1 Samples discarded (damr < 80)

samples_lowdamr <- damr %>% 
  filter(damr<80) %>% 
  select(sample)

sample_metadata[sample_metadata$sample %in% samples_lowdamr$sample,] %>% count(region, type, season, sort = TRUE)
# A tibble: 8 × 4
  region    type      season     n
  <chr>     <chr>     <chr>  <int>
1 Irun      natural   autumn     6
2 Aia       protected autumn     3
3 Orio      urban     autumn     2
4 Usurbil   natural   spring     2
5 Usurbil   natural   autumn     1
6 Villabona natural   autumn     1
7 Zarautz   protected autumn     1
8 Zarautz   protected spring     1
sample_metadata[!sample_metadata$sample %in% samples_lowdamr$sample,] %>% count(region, type, season,sort = TRUE)
sample_metadata_filtdamr <- sample_metadata[!sample_metadata$sample %in% samples_lowdamr$sample,]
genome_counts_filtdamr <- genome_counts_filt %>%
  select(c("genome",sample_metadata_filtdamr$sample))%>%
  filter(rowSums(across(where(is.numeric)))!=0)
genome_metadata_filtdamr <- genome_metadata %>%
  semi_join(., genome_counts_filtdamr, by = "genome") %>% 
  arrange(match(genome,genome_counts_filtdamr$genome))
genome_metadata_removed <- genome_metadata %>%
  anti_join(., genome_counts_filtdamr, by = "genome") %>% 
  arrange(match(genome,genome_counts_filtdamr$genome))
genome_tree_filtdamr <- keep.tip(genome_tree, tip=genome_metadata_filtdamr$genome) 
genome_counts_filt_func <- genome_counts_filt[genome_counts_filt$genome %in% rownames(genome_gifts),]
genome_gifts_filtdamr <- genome_gifts[rownames(genome_gifts) %in% genome_counts_filtdamr$genome,]
genome_gifts_filtdamr <- genome_gifts_filtdamr[, colSums(genome_gifts_filtdamr != 0) > 0]
phylum_colors_filtdamr <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata_filtdamr, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree_filtdamr$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)
save(sample_metadata_filtdamr,
     genome_counts_filtdamr,
     genome_metadata_filtdamr,
     genome_tree_filtdamr,
     genome_gifts_filtdamr,
     phylum_colors_filtdamr,
     location_colors,
     type_colors,
     file = "data/data_filtdamr_80.Rdata"
     )